In [300]:
import numpy as np
from skimage import io
In [301]:
import os
train_directory = "./img/grey/"
In [302]:
def images(image_directory):
    """Return the full path of every file in ``image_directory``, sorted.

    Sorting matters: ``os.listdir`` returns entries in arbitrary order, and
    this function is called several times (labels at In[303]/In[304], pixel
    data at In[308]). Without a deterministic order the label vector ``y``
    and the rows of ``data`` could be misaligned across calls.
    """
    return sorted(image_directory + image for image in os.listdir(image_directory))
images(train_directory)
Out[302]:
In [303]:
train_image_names = images(train_directory)
In [304]:
# Function to extract labels
def extract_labels(file_names):
    '''Create labels from file names: kfc = 0, mcd = 1 and anything else (sub) = 2.'''

    def label_for(name):
        # 'kfc' takes priority if a name somehow contains both substrings.
        name = str(name)
        if 'kfc' in name:
            return 0
        if 'mcd' in name:
            return 1
        return 2

    return np.array([label_for(f) for f in file_names], dtype=np.uint8)
extract_labels(train_image_names)
Out[304]:
In [305]:
y = extract_labels(train_image_names)
# Save labels: np.save(file or string, array)
np.save('y', y)
In [306]:
from PIL import Image
In [307]:
def img_to_matrix(filename):
    '''
    takes a filename and turns it into a numpy array of pixels

    For a single-band (greyscale) image this is a flat 1-D array; for a
    multi-band image it is an (n_pixels, n_bands) array, matching the
    layout of PIL's ``getdata``.
    '''
    # Use a context manager so the underlying file handle is released
    # promptly: Image.open is lazy and otherwise keeps the file open,
    # leaking descriptors when this is called in a loop over many images.
    with Image.open(filename) as img:
        pixels = list(img.getdata())
    return np.asarray(pixels)
In [308]:
# Build the design matrix: one flattened image per row, in the same order
# as the file listing used for the labels.
data = np.array([img_to_matrix(path) for path in images(train_directory)])
data.shape
Out[308]:
In [309]:
# Persist the pixel matrix as plain text for the later loadtxt cells.
# NOTE(review): savetxt needs a 2-D array, so this only works if every image
# has the same pixel count; np.save (binary) would be smaller and faster —
# confirm the text format is actually required downstream.
np.savetxt("./img/train.txt", data)
In [310]:
import numpy as np
import math
import matplotlib.pyplot as plt
from PIL import Image
In [311]:
X_train = np.loadtxt('./img/train.txt')
In [312]:
print("Shape of training set: {}".format(X_train.shape))
In [313]:
def image_grid(D, H, W, cols=10, scale=1):
    """ display a grid of images
    D: array with one flattened image per row
    H,W: Height and width of the images
    cols: number of columns = number of images in each row
    scale: 1 to fill screen
    """
    n_images = np.shape(D)[0]
    rows = int(math.ceil((n_images + 0.0) / cols))
    plt.figure(1, figsize=[scale * 20.0 / H * W, scale * 20.0 / cols * rows], dpi=300)
    for idx in range(n_images):
        plt.subplot(rows, cols, idx + 1)
        plt.imshow(np.reshape(D[idx, :], [H, W]), cmap=plt.get_cmap("gray"))
        plt.axis('off')
# Dimensions of each greyscale training image (rows x columns of pixels).
H = 100
W = 500
In [314]:
# Visualise the per-pixel mean of the training set as a sanity check.
mean_image = np.mean(X_train, axis=0)
plt.imshow(np.reshape(mean_image,[H,W]), cmap = plt.get_cmap("gray"))
plt.show()
In [315]:
from sklearn.decomposition import PCA
n_components = 40
In [316]:
pca = PCA(n_components=n_components, svd_solver='randomized',
whiten=True).fit(X_train)
In [317]:
# Project the training data onto the fitted principal components and keep
# copies of the raw arrays for the autoML experiment further down.
pca_result = pca.transform(X_train)
print(X_train.shape)
print(pca_result.shape)
print(y.shape)
x_train = X_train.copy()  # for autoML
# Bug fix: the original `y.copy` (no parentheses) stored the bound method
# object itself rather than a copy of the label array.
Y_train = y.copy()
In [318]:
%matplotlib inline
plt.hist(pca.explained_variance_ratio_, bins=n_components, log=True)
Out[318]:
In [319]:
import pandas as pd
pca.explained_variance_ratio_.sum()
Out[319]:
In [320]:
# Wrap the labels and pixel matrix in DataFrames for the sklearn estimators.
labels= pd.DataFrame(y)
value= pd.DataFrame(X_train)
# NOTE(review): duplicate import — PCA was already imported above.
from sklearn.decomposition import PCA
# Re-fit PCA with more components on the same training matrix.
n_components = 100
pca = PCA(n_components=n_components, svd_solver='randomized',
whiten=True).fit(value)
# NOTE(review): pca_result is computed here but the models below are fit on
# the raw `value` matrix, not on this projection — confirm that is intended.
pca_result = pca.transform(X_train)
print(value.shape)
print(pca_result.shape)
In [321]:
value.head()
Out[321]:
In [322]:
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
# KMeans is unsupervised: the `labels` argument is accepted by fit() but
# ignored, and the cluster ids it later predicts are arbitrary — they are
# not aligned with the class labels 0/1/2.
kmeans = KMeans(n_clusters=3)
ksv= kmeans.fit(value, labels) #kmeans
In [323]:
# NOTE(review): `labels` is an (n, 1) DataFrame; sklearn expects a 1-D y and
# will warn/ravel it internally — consider passing y directly.
tsv= SVC(kernel='rbf').fit(value, labels) # svm
In [324]:
knn= KNeighborsClassifier() #knn
knn.fit(value, labels)
Out[324]:
In [325]:
# Load the held-out test pixels (same text format as train.txt).
tt= np.loadtxt('./img/test_x.txt')
value2= pd.DataFrame(tt)
value2.head()
Out[325]:
In [326]:
knn.predict(value2)
Out[326]:
In [327]:
ksv.predict(value2)
Out[327]:
In [328]:
tsv.predict(value2)
Out[328]:
In [329]:
from sklearn.metrics import precision_score
In [330]:
#print("accuracy"+ str(roc_auc_score(value2, tsv.predict(value2))))
In [331]:
from sklearn.model_selection import train_test_split
In [332]:
# NOTE(review): this overwrites the PCA projection with the raw pixel data,
# so everything below trains on unreduced features — confirm intended.
pca_result = data
In [333]:
# Hold out 20% of the images for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2,
random_state=0)
In [334]:
knn = KNeighborsClassifier(n_neighbors=1)
model= knn.fit(X_train, y_train)
In [335]:
predictions= knn.predict(X_test)
In [336]:
print (model.score(X_test, y_test))
In [337]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
In [338]:
# KMeans is unsupervised, so fit() ignores any y argument; the original
# passed y_test here (wrong length, misleading). Fit on the features alone.
Kmeans_model = kmeans.fit(X_train)
In [339]:
predictions= kmeans.predict(X_test)
In [340]:
print(Kmeans_model.score(X_test, y_test))
In [341]:
from sklearn import svm
# NOTE(review): this rebinds the name `svm` from the module to an estimator
# instance, shadowing the import. Later cells rely on `svm` being the model,
# so it is kept — but a distinct name (e.g. svm_clf) would be clearer.
svm= svm.SVC(kernel='linear', C=1)
In [342]:
svm_model= svm.fit(X_train, y_train)
In [343]:
predictions= svm.predict(X_test)
In [344]:
print(svm_model.score(X_test, y_test))
In [345]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
In [346]:
scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
print("knn")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [ ]:
In [347]:
# NOTE(review): 'accuracy' is not meaningful for KMeans — the predicted
# cluster ids (0..2) are arbitrary and not aligned with the class labels,
# so any agreement here is coincidental.
scores = cross_val_score(kmeans, X_train, y_train, cv=5, scoring='accuracy')
print("kmeans")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [ ]:
In [348]:
scores = cross_val_score(svm, X_train, y_train, cv=5, scoring='accuracy')
print("svm")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [349]:
scores
Out[349]:
In [350]:
""""
from tpot import TPOTClassifier
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, n_jobs=-1, cv= 5)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
Best pipeline: GaussianNB(LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l1))
0.363636363636
No significant improvement on this dataset
"""
Out[350]:
In [ ]:
The pipeline generated by autoML (TPOT) does not achieve better accuracy than our best-performing SVM classifier.
Improvements could be made by using convolutional neural networks, by increasing the amount of training data, or by creating an ensemble of algorithms.
In [351]:
import mlxtend
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression
# Stack the three base models and let a logistic regression learn how to
# combine their predictions.
# NOTE(review): KMeans is not a classifier — its cluster ids are arbitrary —
# so including it as a base learner likely adds noise; verify it helps.
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[kmeans, svm, knn],
meta_classifier=lr)
In [352]:
print('5-fold cross validation:\n')
# Evaluate each base model and the stacked ensemble with an identical
# cross-validation setup so the accuracies are directly comparable.
classifiers = [kmeans, svm, knn, sclf]
names = ['kmeans', 'svm', 'knn', 'StackingClassifier']
for clf, label in zip(classifiers, names):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
By stacking the classifiers, we are able to improve the overall accuracy to 0.53.
In [ ]: